{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>B</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>MEDV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00632</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.31</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.575</td>\n",
       "      <td>65.2</td>\n",
       "      <td>4.0900</td>\n",
       "      <td>1</td>\n",
       "      <td>296</td>\n",
       "      <td>15.3</td>\n",
       "      <td>396.90</td>\n",
       "      <td>4.98</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02731</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>6.421</td>\n",
       "      <td>78.9</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2</td>\n",
       "      <td>242</td>\n",
       "      <td>17.8</td>\n",
       "      <td>396.90</td>\n",
       "      <td>9.14</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.02729</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>7.185</td>\n",
       "      <td>61.1</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2</td>\n",
       "      <td>242</td>\n",
       "      <td>17.8</td>\n",
       "      <td>392.83</td>\n",
       "      <td>4.03</td>\n",
       "      <td>34.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.03237</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.998</td>\n",
       "      <td>45.8</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>394.63</td>\n",
       "      <td>2.94</td>\n",
       "      <td>33.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.06905</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>7.147</td>\n",
       "      <td>54.2</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>396.90</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \\\n",
       "0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   \n",
       "1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   \n",
       "2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   \n",
       "3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   \n",
       "4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   \n",
       "\n",
       "        B  LSTAT  MEDV  \n",
       "0  396.90   4.98  24.0  \n",
       "1  396.90   9.14  21.6  \n",
       "2  392.83   4.03  34.7  \n",
       "3  394.63   2.94  33.4  \n",
       "4  396.90    NaN  36.2  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load data\n",
    "housing_df = pd.read_csv('HousingData.csv')\n",
    "housing_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop null values\n",
    "housing_df = housing_df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# declare X and y\n",
    "X = housing_df.iloc[:,:-1]\n",
    "y = housing_df.iloc[:, -1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Create training and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Create the regressor: reg\n",
    "reg = LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Fit the regressor to the training data\n",
    "reg.fit(X_train, y_train )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict on the test data: y_pred\n",
    "y_pred = reg.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.101992863274115\n"
     ]
    }
   ],
   "source": [
    "# Compute and print RMSE\n",
    "rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
    "print(rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#Exercise146 begins from here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def regression_model_cv(model, k=5):\n",
    "    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=k)\n",
    "    rmse = np.sqrt(-scores)\n",
    "    print('Reg rmse:', rmse)\n",
    "    print('Reg mean:', rmse.mean ())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]\n",
      "Reg mean: 5.337868962878366\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(LinearRegression())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [ 3.72504914  6.01655701 23.20863933]\n",
      "Reg mean: 10.983415161090742\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(LinearRegression(), k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]\n",
      "Reg mean: 5.086590810801108\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(LinearRegression(), k=6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#Exercise147 begins from here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [ 8.24568226  8.81322798 10.58043836  8.85643441  5.98100069]\n",
      "Reg mean: 8.495356738515685\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "regression_model_cv(KNeighborsRegressor())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [ 8.44659788  8.99814547 10.97170231  8.86647969  5.72114135]\n",
      "Reg mean: 8.600813339223432\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(KNeighborsRegressor(n_neighbors=4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [ 7.99710601  8.68309183 10.66332898  8.90261573  5.51032355]\n",
      "Reg mean: 8.351293217401393\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(KNeighborsRegressor(n_neighbors=7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [ 7.47549287  8.62914556 10.69543822  8.91330686  6.52982222]\n",
      "Reg mean: 8.448641147609868\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(KNeighborsRegressor(n_neighbors=10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#Exercise148 begins from here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "neighbors = np.linspace(1, 20, 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = neighbors.astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = {'n_neighbors': k}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "knn = KNeighborsRegressor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "          weights='uniform'),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,\n",
       "       18, 19, 20])},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring='neg_mean_squared_error', verbose=0)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "knn_tuned.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best n_neighbors: {'n_neighbors': 7}\n",
      "Best score: 8.523048500643897\n"
     ]
    }
   ],
   "source": [
    "k = knn_tuned.best_params_\n",
    "print(\"Best n_neighbors: {}\".format(k))\n",
    "score = knn_tuned.best_score_\n",
    "rsm = np.sqrt(-score)\n",
    "print(\"Best score: {}\".format(rsm))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#Exercise149 begins from here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.76418835 7.09740548 7.43743525 6.65318045 5.60443963]\n",
      "Reg mean: 6.111329831868544\n"
     ]
    }
   ],
   "source": [
    "from sklearn import tree\n",
    "regression_model_cv(tree.DecisionTreeRegressor(random_state = 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.48722515 4.18351663 4.56892642 6.89436901 4.00974326]\n",
      "Reg mean: 4.6287560937021865\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "regression_model_cv(RandomForestRegressor(random_state = 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#Exercise150 begins from here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.21308458 3.70807446 4.92826249 6.61422343 3.76347521]\n",
      "Reg mean: 4.445424035107399\n"
     ]
    }
   ],
   "source": [
    "regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state = 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = {'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],\n",
    "             'min_samples_split': [2, 3, 4, 5],\n",
    "             'min_samples_leaf': [1, 2, 3],\n",
    "             'max_features': ['auto', 'sqrt']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg = RandomForestRegressor(n_jobs = -1, random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=5, error_score='raise',\n",
       "          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "           max_features='auto', max_leaf_nodes=None,\n",
       "           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "           min_samples_leaf=1, min_samples_split=2,\n",
       "           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n",
       "           oob_score=False, random_state=0, verbose=0, warm_start=False),\n",
       "          fit_params=None, iid=True, n_iter=10, n_jobs=1,\n",
       "          param_distributions={'max_depth': [None, 10, 30, 50, 70, 100, 200, 400], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3], 'max_features': ['auto', 'sqrt']},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score='warn', scoring='neg_mean_squared_error',\n",
       "          verbose=0)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg_tuned.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best n_neighbors: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 100}\n",
      "Best score: 4.581240633803964\n"
     ]
    }
   ],
   "source": [
    "p = reg_tuned.best_params_\n",
    "print(\"Best n_neighbors: {}\".format(p))\n",
    "score = reg_tuned.best_score_\n",
    "rsm = np.sqrt(-score)\n",
    "print(\"Best score: {}\".format(rsm))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.17992801 3.72887251 4.83618491 6.50196282 3.92813509]\n",
      "Reg mean: 4.435016667148898\n"
     ]
    }
   ],
   "source": [
    "# Setup the hyperparameter grid\n",
    "regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500, random_state = 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Part from Exercise 156\n",
    "X = housing_df.iloc[:,:-1]\n",
    "y = housing_df.iloc[:, -1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reg rmse: [3.79117796 3.66572906 5.90361934 6.24188092 4.20210617]\n",
      "Reg mean: 4.760902690246876\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import AdaBoostRegressor\n",
    "regression_model_cv(AdaBoostRegressor(random_state = 0))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_Workshop",
   "language": "python",
   "name": "python_workshop"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
